import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree
# Load the dataset.
# BUG FIX: use a raw string for the Windows path — in a plain string "\d"
# and "\c" are invalid escape sequences (a SyntaxWarning on modern Python),
# and a path segment starting with e.g. "\t" would silently become a tab.
df = pd.read_csv(r'C:\datasets\coronavirusdataset.csv')
# Examine the structure of the dataset
print(df.head())
print(df.info())
batch_date test_name swab_type covid19_test_results \
0 2020-10-20 SARS-CoV-2, NAA Nasal Negative
1 2020-10-20 COVID-19 PCR External Result Nasal Negative
2 2020-10-20 Rapid COVID-19 PCR Test Nasal Negative
3 2020-10-20 Rapid COVID-19 PCR Test Nasal Negative
4 2020-10-20 Rapid COVID-19 PCR Test Nasal Negative
age high_risk_exposure_occupation high_risk_interactions diabetes chd \
0 39 False NaN False False
1 56 False NaN False False
2 35 False NaN False False
3 37 False NaN False False
4 42 False NaN False False
htn ... headache loss_of_smell loss_of_taste runny_nose \
0 False ... False False False False
1 False ... False False False False
2 False ... False False False False
3 False ... False False False False
4 False ... False False False False
muscle_sore sore_throat cxr_findings cxr_impression cxr_label cxr_link
0 False False NaN NaN NaN NaN
1 False False NaN NaN NaN NaN
2 False False NaN NaN NaN NaN
3 False False NaN NaN NaN NaN
4 False False NaN NaN NaN NaN
[5 rows x 45 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7294 entries, 0 to 7293
Data columns (total 45 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 batch_date 7294 non-null object
1 test_name 7294 non-null object
2 swab_type 7294 non-null object
3 covid19_test_results 7294 non-null object
4 age 7294 non-null int64
5 high_risk_exposure_occupation 7294 non-null bool
6 high_risk_interactions 2727 non-null object
7 diabetes 7294 non-null bool
8 chd 7294 non-null bool
9 htn 7294 non-null bool
10 cancer 7294 non-null bool
11 asthma 7294 non-null bool
12 copd 7294 non-null bool
13 autoimmune_dis 7294 non-null bool
14 smoker 7294 non-null bool
15 temperature 1869 non-null float64
16 pulse 1866 non-null float64
17 sys 1727 non-null float64
18 dia 1727 non-null float64
19 rr 1544 non-null float64
20 sats 1869 non-null float64
21 rapid_flu_results 6 non-null object
22 rapid_strep_results 11 non-null object
23 ctab 1288 non-null object
24 labored_respiration 1963 non-null object
25 rhonchi 723 non-null object
26 wheezes 961 non-null object
27 days_since_symptom_onset 591 non-null float64
28 cough 7294 non-null bool
29 cough_severity 178 non-null object
30 fever 3137 non-null object
31 sob 7294 non-null bool
32 sob_severity 82 non-null object
33 diarrhea 7294 non-null bool
34 fatigue 7294 non-null bool
35 headache 7294 non-null bool
36 loss_of_smell 7294 non-null bool
37 loss_of_taste 7294 non-null bool
38 runny_nose 7294 non-null bool
39 muscle_sore 7294 non-null bool
40 sore_throat 7294 non-null bool
41 cxr_findings 7 non-null object
42 cxr_impression 7 non-null object
43 cxr_label 7 non-null object
44 cxr_link 7 non-null object
dtypes: bool(19), float64(7), int64(1), object(18)
memory usage: 1.6+ MB
None
# --- Missing-value handling and feature preparation ------------------------
# Dropping columns irrelevant to prediction (date / test metadata).
df.drop(columns=['batch_date', 'test_name', 'swab_type'], inplace=True)
# Convert 'covid19_test_results' to numerical format (it contains
# 'Positive' and 'Negative'; any other value would become NaN).
df['covid19_test_results'] = df['covid19_test_results'].map({'Positive': 1, 'Negative': 0})
# For numerical columns, fill missing values with the column mean.
# BUG FIX: the original code ran a blanket df.fillna(0) BEFORE these
# targeted imputations, so no NaN remained for the mean-fill or the
# 'Unknown'-fill to act on — both were no-ops, and had the order been
# reversed the zeros would have distorted every column mean. The blanket
# fillna(0) has been removed so the targeted imputations actually run.
numerical_columns = df.select_dtypes(include=['float64']).columns
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())
# For categorical (object-dtype) columns, fill missing values with a
# sentinel category.
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns] = df[categorical_columns].fillna('Unknown')
# Convert categorical columns to numerical using one-hot encoding
# (drop_first avoids perfectly collinear dummy columns).
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
C:\Users\ENVIRONMENT-5\anaconda3\lib\site-packages\pandas\core\algorithms.py:798: FutureWarning: In a future version, the Index constructor will not infer numeric dtypes when passed object-dtype sequences (matching Series behavior) uniques = Index(uniques)
# Separate the prediction target from the feature matrix.
y = df['covid19_test_results']
X = df.drop(columns=['covid19_test_results'])
# Hold out 20% of the rows for evaluation; fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit a 100-tree Random Forest baseline.
model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
# Report mean accuracy on both splits.
print("Accuracy on training dataset", model.score(X_train, y_train))
print("Accuracy on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.9988003427592117 Accuracy on testing dataset 0.997943797121316
# Refit the forest with decision stumps (max_depth=1) to probe how much
# depth the problem actually needs.
model = RandomForestClassifier(
    n_estimators=100, random_state=0, max_depth=1
)
model.fit(X_train, y_train)
# Compare accuracy on both splits against the unconstrained forest.
print("Accuracy on training dataset", model.score(X_train, y_train))
print("Accuracy on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.9962296486718081 Accuracy on testing dataset 0.997943797121316
# Per-feature importances of the depth-1 forest; order matches X's columns.
print("Feature importance: \n", model.feature_importances_)
Feature importance: [0.04 0.01 0.05 0.07 0.02 0. 0. 0. 0. 0. 0.1 0.05 0.02 0.03 0.01 0.06 0.04 0. 0.01 0.04 0.03 0.03 0.1 0.04 0.04 0.02 0.06 0.01 0. 0. 0. 0. 0.02 0.01 0. 0.02 0. 0.06 0. 0.01 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]
import matplotlib.pyplot as plt
import numpy as np
# Plot the Random Forest feature importances as a horizontal bar chart.
# BUG FIX: the original figsize of (150, 100) is in INCHES, producing an
# enormous figure that renders slowly and can exhaust memory; scale the
# height to the number of features instead.
n_features = X_train.shape[1]
fig, ax = plt.subplots(figsize=(10, max(6, 0.25 * n_features)))
# One bar per feature.
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name.
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the x-axis
ax.set_xlabel('Feature Importance')
# Show the plot
plt.show()
# Rebuild the target vector and feature matrix.
y = df['covid19_test_results']
X = df.drop(columns=['covid19_test_results'])
# Identical 80/20 split as before (random_state=42 makes it the same rows).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit a 100-stage Gradient Boosting baseline.
model = GradientBoostingClassifier(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
# Report mean accuracy on both splits.
print("Accuracy on training dataset", model.score(X_train, y_train))
print("Accuracy on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.9984575835475579 Accuracy on testing dataset 0.995887594242632
# Refit the booster with stump base learners (max_depth=1).
model = GradientBoostingClassifier(
    n_estimators=100, random_state=0, max_depth=1
)
model.fit(X_train, y_train)
# Compare accuracy on both splits against the default-depth booster.
print("Accuracy on training dataset", model.score(X_train, y_train))
print("Accuracy on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.9965724078834619 Accuracy on testing dataset 0.997943797121316
# Shrink the learning rate to 0.01 (with stumps) to see whether slower
# learning changes the generalization gap.
model = GradientBoostingClassifier(
    n_estimators=100, random_state=0, learning_rate=0.01, max_depth=1
)
model.fit(X_train, y_train)
# Report mean accuracy on both splits.
print("Accuracy on training dataset", model.score(X_train, y_train))
print("Accuracy on testing dataset", model.score(X_test, y_test))
# Per-feature importances of the low-learning-rate booster; order matches X's columns.
print("Feature importance: \n", model.feature_importances_)
Feature importance: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.7774512 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.2225488 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]
import matplotlib.pyplot as plt
import numpy as np
# Plot the Gradient Boosting feature importances as a horizontal bar chart.
# BUG FIX: figsize=(150, 100) is in INCHES — a wildly oversized canvas that
# is slow to draw and memory-hungry; use a height proportional to the
# number of features instead.
n_features = X_train.shape[1]
fig, ax = plt.subplots(figsize=(10, max(6, 0.25 * n_features)))
# One bar per feature.
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name.
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the x-axis
ax.set_xlabel('Feature Importance')
# Show the plot
plt.show()
# Fit a single unconstrained decision tree on the same split for comparison.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
# Report mean accuracy on both splits.
print("Accuracy on training dataset", model.score(X_train, y_train))
print("Accuracy on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.9988003427592117 Accuracy on testing dataset 0.9890335846470185
# Refit the tree as a single stump (max_depth=1): one split only.
model = DecisionTreeClassifier(max_depth=1, random_state=0)
model.fit(X_train, y_train)
# Compare accuracy on both splits against the full-depth tree.
print("Accuracy on training dataset", model.score(X_train, y_train))
print("Accuracy on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.996401028277635 Accuracy on testing dataset 0.9972583961617546
# Per-feature importances of the stump: exactly one feature gets weight 1.
print("Feature importance: \n", model.feature_importances_)
Feature importance: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
from sklearn import tree
# Visualize the depth-1 decision tree (its single split).
# BUG FIX: class_names must be the TARGET class labels in sorted class
# order (0 = Negative, 1 = Positive), not feature names — the original
# passed ["age", "temperature"], which mislabeled every leaf.
fig = plt.figure(figsize=(25, 15))
_ = tree.plot_tree(model, filled=True, rounded=True,
                   feature_names=X.columns,
                   class_names=["Negative", "Positive"])
plt.show()
import matplotlib.pyplot as plt
import numpy as np
# Plot the Decision Tree feature importances as a horizontal bar chart.
# BUG FIX: figsize=(150, 100) is in INCHES, yielding a gigantic figure
# that renders slowly and can exhaust memory; size the height to the
# feature count instead.
n_features = X_train.shape[1]
fig, ax = plt.subplots(figsize=(10, max(6, 0.25 * n_features)))
# One bar per feature.
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name.
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
# Label the x-axis
ax.set_xlabel('Feature Importance')
# Show the plot
plt.show()